import pytesseract
import cv2
from PIL import Image
import sys
from pdf2image import convert_from_path

# Path to the tesseract executable (only needs to be set on Windows).
pytesseract.pytesseract.tesseract_cmd = r'D:\rjbao\tesseract\cx\tesseract.exe'  # example path; adjust to match the local installation
#
# Earlier experiments, kept for reference:
# # im =cv2.imread(r'D:\Desktop\图片\1741143908881.png',cv2.IMREAD_COLOR)
# # image = Image.open(r'D:\Desktop\图片\高山杨山河施工合同.pdf')

# Input document and the directory holding the Poppler binaries that
# pdf2image is pointed at via ``poppler_path``.
pdf_file = r'D:\Desktop\图片\高山杨山河施工合同.pdf'
poppler_bin_dir = r'D:\rjbao\poppler\poppler-24.08.0\Library\bin'

# Render the PDF pages to images at 600 DPI, using PPM as the
# intermediate output format.
images = convert_from_path(
    pdf_path=pdf_file,
    poppler_path=poppler_bin_dir,
    fmt="ppm",
    dpi=600,
)

# Report how many page images were produced.
page_count = len(images)
print(page_count)

# OCR settings passed to Tesseract: Simplified Chinese language data
# (chi_sim), engine mode 3 and page segmentation mode 6.
# Alternative tried earlier: '-l chi_sim --oem 1 --psm 3'.
# Built once here because it does not change from page to page.
config = '-l chi_sim --oem 3 --psm 6'

# Recognized text of every page, in page order, so it remains available
# after the loop (previously this list was created but never filled).
texts = []
for image in images:
    # Convert the page to single-channel grayscale ('L' mode) before OCR.
    gray_page = image.convert('L')
    text = pytesseract.image_to_string(gray_page, config=config)
    texts.append(text)
    print(text)




# text =pytesseract.image_to_string(images[0], config=('-l chi_sim --oem 1 --psm 3'))
# print(text)







